Importing Essential Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sb
import plotly.express as px
import matplotlib.pyplot as plot
from collections import Counter

Helper Functions

In [4]:
# Most frequent value and how often it occurs
def most_common_Vales(value):
  """Return the most frequent element of ``value`` and its occurrence count.

  Args:
    value: Any iterable of hashable items accepted by ``collections.Counter``.

  Returns:
    A ``(element, count)`` tuple for the single most common element.

  Raises:
    IndexError: If ``value`` is empty (there is no most common element).
  """
  # most_common(1) yields a one-item list holding the (element, count) pair.
  return Counter(value).most_common(1)[0]
In [5]:
# Largest value and the number of times it appears
def max_count_value(value):
    """Return the maximum of ``value`` and how many elements equal it.

    Args:
        value: Array-like exposing ``.max()`` and element-wise ``==``
            (e.g. a NumPy array or a pandas Series).

    Returns:
        A ``(maximum, occurrences)`` tuple.
    """
    peak = value.max()
    # The boolean mask is True wherever an element equals the maximum.
    return peak, np.count_nonzero(value == peak)
In [6]:
# Smallest value and the number of times it appears
def min_count_value(value):
    """Return the minimum of ``value`` and how many elements equal it.

    Args:
        value: Array-like exposing ``.min()`` and element-wise ``==``
            (e.g. a NumPy array or a pandas Series).

    Returns:
        A ``(minimum, occurrences)`` tuple.
    """
    floor = value.min()
    # The boolean mask is True wherever an element equals the minimum.
    return floor, np.count_nonzero(value == floor)
In [7]:
#describe df
def describe(col):
    if col.dtype !="object":
        des=(pd.DataFrame(col).describe().style.background_gradient(cmap='viridis', axis=1))
    else:
        des=(pd.DataFrame(col).describe(include="object").style.background_gradient(cmap='viridis', axis=1))
    return des
In [8]:
# Direction of the data trend, judged by comparing mean and median
def Data_trend(value):
    """Label the trend of ``value`` from the relation of its mean to its median.

    Args:
        value: Object exposing ``.mean()`` and ``.median()`` (e.g. a pandas Series).

    Returns:
        One of three fixed strings: "Right" when the mean exceeds the median,
        "Left" when it is below, otherwise "symmetric".
    """
    average = value.mean()
    midpoint = value.median()
    if average > midpoint:
        return "Data Trend equel : Right"
    if average < midpoint:
        return "Data Trend equel : Left"
    # Reached when neither comparison holds (mean equals median).
    return "Data Trend equel : symmetric"
In [9]:
def summary(value):
    """Print the maximum, minimum and most frequent value of ``value`` with their counts.

    Args:
        value: Data passed unchanged to ``max_count_value``,
            ``min_count_value`` and ``most_common_Vales``.

    Returns:
        None. The three findings are printed, separated by rows of asterisks.
    """
    divider = '*' * 100

    # Largest value and how often it occurs
    largest, largest_hits = max_count_value(value)
    print(f"The maximum value equals : {largest} --- The number of Value  {largest}  is equal {largest_hits}")
    print(divider)

    # Smallest value and how often it occurs
    smallest, smallest_hits = min_count_value(value)
    print(f"The minimum value equals : {smallest} --- The number of value {smallest}  is equal {smallest_hits}")
    print(divider)

    # Most frequent value and its frequency
    mode_value, mode_hits = most_common_Vales(value)
    print(f"The maximum duplication value equals : {mode_value}  --- The number of duplications is greater than the duplication value is equal to : {mode_hits}")
In [10]:
def create_pairplot(df, numerical_cols):
    """Draw and show a seaborn pairplot of the selected numerical columns.

    Args:
        df: DataFrame holding the data.
        numerical_cols: Column labels of ``df`` to include in the grid.

    Returns:
        None. The figure is displayed with ``plot.show()``.
    """
    # seaborn is imported as `sb` in this notebook; the previous `sns` alias
    # was never defined and raised NameError on the first call.
    g = sb.pairplot(df[numerical_cols], diag_kind="kde")  # KDE curves on the diagonal
    # y > 1 places the title above the top row of axes
    g.fig.suptitle('Pairplot of Numerical Features', y=1.02)
    plot.tight_layout()
    plot.show()
In [11]:
def create_boxplots(df):
    """Draw and show one vertical box plot per column of ``df`` on a shared axis.

    Args:
        df: DataFrame to plot via ``df.plot.box``.

    Returns:
        None. The figure is displayed with ``plot.show()``.
    """
    fig, ax = plot.subplots(figsize=(10, 6))
    df.plot.box(vert=True, ax=ax)
    ax.set_title("Boxplot for All Numerical Columns")
    # Vertical boxes: columns run along the x axis and values along the y axis
    # (the two labels were previously swapped).
    ax.set_xlabel("Column Name")
    ax.set_ylabel("Value")
    plot.show()

Read Data

In [13]:
# Load the dataset; the path is relative, so the CSV must sit in the working directory.
df_LAP = pd.read_csv('Loan approval prediction.csv')
In [14]:
# Work on a copy so df_LAP keeps the data exactly as loaded.
df=df_LAP.copy()
In [15]:
# Preview 25 random rows; the fixed random_state makes the sample reproducible.
df.sample(n=25, random_state=49)
Out[15]:
id person_age person_income person_home_ownership person_emp_length loan_intent loan_grade loan_amnt loan_int_rate loan_percent_income cb_person_default_on_file cb_person_cred_hist_length loan_status
56796 56796 29 48000 MORTGAGE 5.0 PERSONAL A 10000 7.51 0.21 N 10 0
40685 40685 35 74000 RENT 15.0 EDUCATION C 15000 12.68 0.20 N 5 0
12572 12572 27 68000 MORTGAGE 11.0 HOMEIMPROVEMENT A 10750 7.51 0.16 N 5 0
2585 2585 25 90000 MORTGAGE 4.0 EDUCATION C 5500 12.73 0.06 Y 4 0
1032 1032 36 80000 RENT 6.0 EDUCATION A 10000 6.17 0.13 N 15 0
12271 12271 43 80000 MORTGAGE 1.0 VENTURE B 7800 11.49 0.09 N 11 0
53475 53475 36 55000 MORTGAGE 3.0 PERSONAL B 4000 10.59 0.07 N 17 0
11869 11869 25 87000 RENT 0.0 HOMEIMPROVEMENT A 8500 9.38 0.10 N 3 0
7035 7035 29 52000 MORTGAGE 3.0 VENTURE A 5000 5.99 0.10 N 5 0
25745 25745 27 90000 MORTGAGE 4.0 PERSONAL A 6000 7.51 0.07 N 8 0
24878 24878 23 105000 MORTGAGE 7.0 DEBTCONSOLIDATION A 12400 5.42 0.12 N 3 0
2144 2144 25 90000 MORTGAGE 9.0 HOMEIMPROVEMENT A 12000 6.92 0.13 N 4 0
20226 20226 28 24000 RENT 0.0 MEDICAL B 3000 12.42 0.13 N 6 0
38225 38225 31 100995 MORTGAGE 6.0 DEBTCONSOLIDATION A 16000 7.51 0.16 N 6 0
49636 49636 23 49000 RENT 6.0 PERSONAL C 11600 15.27 0.24 Y 2 0
40038 40038 27 95000 RENT 1.0 PERSONAL C 5600 14.65 0.05 Y 8 0
37747 37747 38 65000 MORTGAGE 1.0 EDUCATION B 9500 10.00 0.15 N 14 0
44723 44723 27 63000 RENT 3.0 MEDICAL C 7200 15.27 0.11 N 5 0
50655 50655 28 120000 MORTGAGE 0.0 MEDICAL B 15000 10.99 0.13 N 7 0
53879 53879 25 30000 RENT 3.0 VENTURE A 6400 7.49 0.22 N 4 0
22964 22964 32 69000 MORTGAGE 4.0 DEBTCONSOLIDATION B 12000 10.99 0.17 N 7 1
37494 37494 22 40000 RENT 1.0 DEBTCONSOLIDATION B 15000 10.65 0.38 N 2 1
57394 57394 25 60000 RENT 2.0 EDUCATION B 15000 12.53 0.25 N 3 0
55071 55071 22 85000 RENT 6.0 EDUCATION B 7500 10.08 0.09 N 3 0
42195 42195 29 36000 RENT 2.0 EDUCATION C 5500 13.57 0.15 N 7 0
In [16]:
# Column dtypes, non-null counts and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  int64  
 1   person_age                  58645 non-null  int64  
 2   person_income               58645 non-null  int64  
 3   person_home_ownership       58645 non-null  object 
 4   person_emp_length           58645 non-null  float64
 5   loan_intent                 58645 non-null  object 
 6   loan_grade                  58645 non-null  object 
 7   loan_amnt                   58645 non-null  int64  
 8   loan_int_rate               58645 non-null  float64
 9   loan_percent_income         58645 non-null  float64
 10  cb_person_default_on_file   58645 non-null  object 
 11  cb_person_cred_hist_length  58645 non-null  int64  
 12  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(6), object(4)
memory usage: 5.8+ MB
In [17]:
# Number of missing values in each column
df.isna().sum()
Out[17]:
id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64
In [18]:
# Number of rows that are exact duplicates of an earlier row
print("duplicated = ",df.duplicated().sum())
duplicated =  0
In [19]:
# Drop the row identifier, then split the remaining columns by dtype.
# errors="ignore" keeps this cell re-runnable: `df` is rebound here, so a second
# run would otherwise raise KeyError because "id" is already gone.
df = df.drop(columns="id", errors="ignore")
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
object_cols = df.select_dtypes(include=['object']).columns
In [20]:
# Summary statistics with a viridis gradient applied row-wise (axis=1).
df.describe().style.background_gradient(cmap='viridis', axis=1)
Out[20]:
  person_age person_income person_emp_length loan_amnt loan_int_rate loan_percent_income cb_person_cred_hist_length loan_status
count 58645.000000 58645.000000 58645.000000 58645.000000 58645.000000 58645.000000 58645.000000 58645.000000
mean 27.550857 64046.172871 4.701015 9217.556518 10.677874 0.159238 5.813556 0.142382
std 6.033216 37931.106979 3.959784 5563.807384 3.034697 0.091692 4.029196 0.349445
min 20.000000 4200.000000 0.000000 500.000000 5.420000 0.000000 2.000000 0.000000
25% 23.000000 42000.000000 2.000000 5000.000000 7.880000 0.090000 3.000000 0.000000
50% 26.000000 58000.000000 4.000000 8000.000000 10.750000 0.140000 4.000000 0.000000
75% 30.000000 75600.000000 7.000000 12000.000000 12.990000 0.210000 8.000000 0.000000
max 123.000000 1900000.000000 123.000000 35000.000000 23.220000 0.830000 30.000000 1.000000
In [21]:
# Summary of the object-dtype columns: count, unique, top and freq.
df.describe(include='object')
Out[21]:
person_home_ownership loan_intent loan_grade cb_person_default_on_file
count 58645 58645 58645 58645
unique 4 6 7 2
top RENT EDUCATION A N
freq 30594 12271 20984 49943
In [22]:
# Data trend of every non-object column, collected into one table
col = [name for name in df.columns if df[name].dtype != 'object']
trend = [Data_trend(df[name]) for name in col]
trend_ = pd.DataFrame({'Cols': col, 'DataTrend': trend})
trend_
Out[22]:
Cols DataTrend
0 person_age Data Trend equel : Right
1 person_income Data Trend equel : Right
2 person_emp_length Data Trend equel : Right
3 loan_amnt Data Trend equel : Right
4 loan_int_rate Data Trend equel : Left
5 loan_percent_income Data Trend equel : Right
6 cb_person_cred_hist_length Data Trend equel : Right
7 loan_status Data Trend equel : Right
In [23]:
# Pairwise correlations of the numeric columns, colour-graded row by row
cor = (
    df.corr(numeric_only=True)
    .T
    .style
    .background_gradient(cmap='viridis', axis=1)
)
cor
Out[23]:
  person_age person_income person_emp_length loan_amnt loan_int_rate loan_percent_income cb_person_cred_hist_length loan_status
person_age 1.000000 0.102176 0.121276 0.050378 0.009653 -0.031975 0.874260 -0.001130
person_income 0.102176 1.000000 0.164042 0.310942 -0.057611 -0.280314 0.082727 -0.169956
person_emp_length 0.121276 0.164042 1.000000 0.092046 -0.101910 -0.065824 0.102842 -0.100428
loan_amnt 0.050378 0.310942 0.092046 1.000000 0.113582 0.647266 0.045720 0.144982
loan_int_rate 0.009653 -0.057611 -0.101910 0.113582 1.000000 0.152201 0.007535 0.338948
loan_percent_income -0.031975 -0.280314 -0.065824 0.647266 0.152201 1.000000 -0.023202 0.378280
cb_person_cred_hist_length 0.874260 0.082727 0.102842 0.045720 0.007535 -0.023202 1.000000 -0.003030
loan_status -0.001130 -0.169956 -0.100428 0.144982 0.338948 0.378280 -0.003030 1.000000

Exploratory Analysis

Column by column (numeric only)¶

In [26]:
# Column-by-column analysis of the numeric columns:
# print the data trend, then show a box plot and a histogram.
for i in col:
    print("Analysis : ",i)
    data_trend=Data_trend(df[i])
    print(data_trend)
    # Titles carry the current column name; they were previously hard-coded
    # to "Person income" for every column.
    fig=px.box(df[i],color_discrete_sequence=px.colors.qualitative.Dark24,
               template="seaborn",title=f'Box plot of {i}')
    fig.show()
    fig=px.histogram(df[i],color_discrete_sequence=px.colors.qualitative.Dark24,
               template="seaborn",title=f'Histogram of {i}')
    fig.show()
    print("#"*100)
Analysis :  person_age
Data Trend equel : Right
####################################################################################################
Analysis :  person_income
Data Trend equel : Right
####################################################################################################
Analysis :  person_emp_length
Data Trend equel : Right
####################################################################################################
Analysis :  loan_amnt
Data Trend equel : Right
####################################################################################################
Analysis :  loan_int_rate
Data Trend equel : Left
####################################################################################################
Analysis :  loan_percent_income
Data Trend equel : Right
####################################################################################################
Analysis :  cb_person_cred_hist_length
Data Trend equel : Right
####################################################################################################
Analysis :  loan_status
Data Trend equel : Right
####################################################################################################

Count VS Object columns¶

In [28]:
# Row count per home-ownership category, shown as a labelled bar chart
person_home_ownership_group = df['person_home_ownership'].value_counts()
fig = px.bar(
    x=person_home_ownership_group.index,
    y=person_home_ownership_group,
    text=person_home_ownership_group,
    title='Distribution of person home ownership',
    template='seaborn',
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.show()
In [29]:
# Row count per loan intent, shown as a labelled bar chart
loan_intent_group = df['loan_intent'].value_counts()
fig = px.bar(
    x=loan_intent_group.index,
    y=loan_intent_group,
    text=loan_intent_group,
    title='Distribution of Loan Intent',
    template="seaborn",
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.show()
In [30]:
# Row count per loan grade, shown as a labelled bar chart
loan_grade_group = df['loan_grade'].value_counts()
fig = px.bar(
    x=loan_grade_group.index,
    y=loan_grade_group,
    text=loan_grade_group,
    title='Distribution of loan grade',
    template="seaborn",
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.show()
In [31]:
# Row count per value of cb_person_default_on_file, shown as a labelled bar chart
cb_person_default_on_file_group = df['cb_person_default_on_file'].value_counts()
fig = px.bar(
    x=cb_person_default_on_file_group.index,
    y=cb_person_default_on_file_group,
    text=cb_person_default_on_file_group,
    title='Distribution of  cb person default on file',
    template="seaborn",
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.show()
In [32]:
# Row count per loan_status value, shown as a labelled bar chart
loan_status_group = df.groupby('loan_status')['loan_status'].count()
fig = px.bar(
    x=loan_status_group.index,
    y=loan_status_group.values,
    text=loan_status_group.values,
    title='Distribution of loan status',
    template="seaborn",
    color_discrete_sequence=px.colors.qualitative.Dark24,
)
fig.show()

Outliers¶

In [34]:
# One box plot per numeric column, split by loan grade, to expose outliers.
for i in col:
    print( i ," Distribution by Grade")
    plot.figure(figsize=(15, 6))
    sb.boxplot(data=df, x='loan_grade', y=i)
    # The title names the plotted column; it was previously fixed to
    # "Loan Amount" whichever column was drawn.
    plot.title(f'{i} Distribution by Grade')
    plot.show()
    print("#"*100)
person_age  Distribution by Grade
####################################################################################################
person_income  Distribution by Grade
####################################################################################################
person_emp_length  Distribution by Grade
####################################################################################################
loan_amnt  Distribution by Grade
####################################################################################################
loan_int_rate  Distribution by Grade
####################################################################################################
loan_percent_income  Distribution by Grade
####################################################################################################
cb_person_cred_hist_length  Distribution by Grade
####################################################################################################
loan_status  Distribution by Grade
####################################################################################################
In [35]:
# One box plot per numeric column, split by loan intent, to expose outliers.
for i in col:
    print( i ," Distribution by loan intent")
    plot.figure(figsize=(15, 6))
    sb.boxplot(data=df, x='loan_intent', y=i)
    # The title names the plotted column; it was previously fixed to
    # "Loan Amount" whichever column was drawn.
    plot.title(f'{i} Distribution by loan intent')
    plot.show()
    print("#"*100)
person_age  Distribution by loan intent
####################################################################################################
person_income  Distribution by loan intent
####################################################################################################
person_emp_length  Distribution by loan intent
####################################################################################################
loan_amnt  Distribution by loan intent
####################################################################################################
loan_int_rate  Distribution by loan intent
####################################################################################################
loan_percent_income  Distribution by loan intent
####################################################################################################
cb_person_cred_hist_length  Distribution by loan intent
####################################################################################################
loan_status  Distribution by loan intent
####################################################################################################
In [36]:
# One box plot per numeric column, split by home ownership, to expose outliers.
for i in col:
    print( i ," Distribution by person home ownership")
    plot.figure(figsize=(15, 6))
    sb.boxplot(data=df, x='person_home_ownership', y=i)
    # The title names the plotted column; it was previously fixed to
    # "Loan Amount" whichever column was drawn.
    plot.title(f'{i} Distribution by person home ownership')
    plot.show()
    print("#"*100)
person_age  Distribution by person home ownership
####################################################################################################
person_income  Distribution by person home ownership
####################################################################################################
person_emp_length  Distribution by person home ownership
####################################################################################################
loan_amnt  Distribution by person home ownership
####################################################################################################
loan_int_rate  Distribution by person home ownership
####################################################################################################
loan_percent_income  Distribution by person home ownership
####################################################################################################
cb_person_cred_hist_length  Distribution by person home ownership
####################################################################################################
loan_status  Distribution by person home ownership
####################################################################################################
In [37]:
# One box plot per numeric column, split by cb_person_default_on_file, to expose outliers.
for i in col:
    print( i ," Distribution by cb person default on file")
    plot.figure(figsize=(15, 6))
    sb.boxplot(data=df, x='cb_person_default_on_file', y=i)
    # The title names the plotted column; it was previously fixed to
    # "Loan Amount" whichever column was drawn.
    plot.title(f'{i} Distribution by cb person default on file')
    plot.show()
    print("#"*100)
person_age  Distribution by cb person default on file
####################################################################################################
person_income  Distribution by cb person default on file
####################################################################################################
person_emp_length  Distribution by cb person default on file
####################################################################################################
loan_amnt  Distribution by cb person default on file
####################################################################################################
loan_int_rate  Distribution by cb person default on file
####################################################################################################
loan_percent_income  Distribution by cb person default on file
####################################################################################################
cb_person_cred_hist_length  Distribution by cb person default on file
####################################################################################################
loan_status  Distribution by cb person default on file
####################################################################################################

Pairplot for numerical features¶

In [39]:
# Pairwise scatter plots of the numerical features
sb.pairplot(df)
# The pairplot figure is the current figure, so the title is attached to it;
# y > 1 lifts the title above the top row of axes.
plot.gcf().suptitle('Pairplot of Numerical Features', y=1.02)
plot.show()
C:\Users\dell\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning:

The figure layout has changed to tight

Correlation Heatmap¶

In [41]:
# Annotated heatmap of the correlations between the numeric columns
numerical_cols = df.select_dtypes(include=[np.number]).columns
plot.figure(figsize=(12, 8))
sb.heatmap(
    df[numerical_cols].corr(),
    cmap='coolwarm',
    annot=True,
    fmt=".2f",  # two decimals in each cell
)
plot.gca().set_title('Correlation Heatmap')
plot.show()

Histograms for numerical features¶

In [43]:
# One 30-bin histogram per numerical column, drawn on a shared figure
df.hist(figsize=(15, 10), bins=30)
plot.gcf().suptitle('Histograms of Numerical Features')
plot.show()

Group by age count¶

In [45]:
# Assign every row to an age group. The last bin is open-ended: the previous
# upper edge of 100 left ages above 100 (the data reaches 123) without a group,
# so those rows silently vanished from the totals.
age_group = pd.cut(
    df['person_age'],
    bins=[0, 30, 50, np.inf],
    labels=['Young Adult', 'Middle Age', 'Senior'],
).rename('age_group')

# Count the rows (non-null 'person_income' values) in each age group.
# observed=False keeps all three groups in the result even if one is empty.
loan_count_by_age = (
    df.groupby(age_group, observed=False)['person_income']
    .count()
    .reset_index(name='loan_count')
)

# Bar chart of the loan count per age group
fig=px.bar(y=loan_count_by_age['loan_count'],x=loan_count_by_age['age_group'],text=loan_count_by_age['loan_count'],
         color_discrete_sequence=px.colors.qualitative.Dark24,template="seaborn",
         title='Distribution of loan_count by age_group' )
fig.show()

Analysis Loan Status

In [47]:
# Column groups for the loan-status analysis: integer/float64 columns and object columns
numeric_columns = df.select_dtypes(include=['int', 'float64']).columns
object_columns = df.select_dtypes(include='object').columns
In [48]:
# Count plot of every object column, with bars split by loan_status
for column in object_columns:
    plot.figure(figsize=(12, 5))
    sb.countplot(
        data=df,
        x=column,
        hue='loan_status',
        order=sorted(df[column].unique()),  # categories in sorted order on the x axis
    )
    plot.gca().set_title(f'Countplot of {column}')
    plot.gcf().tight_layout()
    plot.show()
In [49]:
# Box plot of every numeric column, split by loan status
for column in numeric_columns:
    # numeric_columns also contains the target itself; plotting loan_status
    # against loan_status carries no information, so it is skipped.
    if column == 'loan_status':
        continue
    sb.boxplot(x='loan_status', y=column, data=df)
    plot.xlabel('Loan Status')
    plot.ylabel(column)
    plot.title('Box Plot of {} vs Loan Status'.format(column))
    plot.show()
In [50]:
# Histogram (with KDE) of loan_amnt, one panel per loan_status value.
# The palette maps each loan_status value to a fixed colour.
viridis_colors = {0: '#440154', 1: '#3B528B'}
g = sb.FacetGrid(
    df,
    col="loan_status",
    hue="loan_status",
    palette=viridis_colors,
    height=5,
    aspect=1.5,
)
g.map(sb.histplot, 'loan_amnt', kde=True)
g.add_legend()
# Leave headroom above the panels for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Loan Amount Distribution by Loan Status')
plot.show()
C:\Users\dell\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning:

The figure layout has changed to tight

In [51]:
# Histogram (with KDE) of person_income, one panel per loan_status value.
# The palette maps each loan_status value to a fixed colour.
viridis_colors = {0: '#440154', 1: '#3B528B'}
g = sb.FacetGrid(
    df,
    col="loan_status",
    hue="loan_status",
    palette=viridis_colors,
    height=5,
    aspect=1.5,
)
g.map(sb.histplot, 'person_income', kde=True)
g.add_legend()
# Leave headroom above the panels for the figure-level title
g.fig.subplots_adjust(top=0.85)
g.fig.suptitle('Person Income Distribution by Loan Status')
plot.show()
C:\Users\dell\anaconda3\Lib\site-packages\seaborn\axisgrid.py:118: UserWarning:

The figure layout has changed to tight

THANK YOU